Here we compare the count results of CellRanger versus scUTRquant using the 10X Chromium 3’-end demonstration data (v2 and v3). Specifically, we check for any systematic differences in genes.
library(SingleCellExperiment)
library(DropletUtils)
library(GenomicRanges)
library(plyranges)
library(rtracklayer)
library(tidyverse)
library(ggbeeswarm)
library(magrittr)
library(cowplot)
library(Matrix)
SAMPLE_SHEET="metadata/counts_sample_sheet.csv"
GTF="data/gtf/adult.utrome.e3.t200.f0.999.w500.gtf.gz"
df_txs <- import(GTF) %>%
filter(type == 'exon') %>%
mcols() %>% as_tibble() %>%
group_by(gene_id) %>%
summarize(n_exons=length(unique(exon_id)),
n_txs=length(unique(transcript_id))) %>%
mutate(n_exons=cut(n_exons, breaks=c(0,1,2,4,9,Inf),
labels=c("1", "2", "3-4", "5-9", "10+")),
n_txs=cut(n_txs, breaks=c(0,1,2,3,4,Inf),
labels=c("1", "2", "3", "4", "5+")))
gr_genes <- import(GTF, genome="mm10") %>%
filter(type == 'gene') %>%
mutate(gene_id=str_extract(gene_id, "^ENSMUSG[0-9]+")) %>%
`names<-`(.$gene_id)
mcols(gr_genes) <- NULL
df_genes <- readRDS("data/utrs/utrome_genes_annotation.Rds") %>%
merge(df_txs, by='gene_id', all.x=TRUE, all.y=FALSE) %>%
`rownames<-`(str_extract(.$gene_id, "^ENSMUSG[0-9]+"))
## conform 10X cell ids to match scUTRquant cell_id
conform_cell_ids <- function (sample_id, sce) {
colData(sce) %<>%
as_tibble %>%
mutate(bx=str_extract(Barcode, "^[ACGT]{16}"),
cell_id=str_c(sample_id, bx, sep='_')) %>%
select(cell_id, bx) %>%
set_rownames(.$cell_id) %>%
DataFrame()
sce
}
## summarize scUTRquant transcript counts to gene counts
txs_to_genes <- function (sce) {
M_genes_txs <- rowData(sce)$gene_id %>%
fac2sparse %>%
set_rownames(str_extract(rownames(.), "^ENSMUSG[0-9]+"))
sce_g <- SingleCellExperiment(assays=list(counts=M_genes_txs %*% counts(sce)),
colData=colData(sce),
rowRanges=gr_genes[rownames(M_genes_txs)])
rowData(sce_g) <- df_genes[rownames(sce_g),]
sce_g
}
df_counts <- read_csv(SAMPLE_SHEET) %>%
## read SCE files
mutate(sce_sq=map(file_scutrquant, readRDS),
sce_10x=map(file_cellranger, read10xCounts)) %>%
## adjust cell_ids
mutate(sce_10x=map2(sample_id, sce_10x, conform_cell_ids)) %>%
## summarize to gene counts
mutate(sce_sq=map(sce_sq, txs_to_genes))
plot_umis_per_gene_compare_dot <- function (sce_x, sce_y, group=NULL,
label_x="CellRanger UMI Counts Per Gene",
label_y="scUTRquant UMI Counts Per Gene") {
group <- enquo(group)
idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
y=rowSums(counts(sce_y[idx_genes,idx_cells])),
gene_id=idx_genes) %>%
left_join(as_tibble(rowData(sce_y[idx_genes,]), rownames="gene_id2"), by=c("gene_id"="gene_id2"))
ggplot(df, aes(x=x+1, y=y+1)) +
geom_point(alpha=0.3, size=0.1, pch=16) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
facet_wrap(vars(!!group)) +
scale_x_log10() + scale_y_log10() +
labs(x=label_x, y=label_y) +
theme_minimal_grid()
}
plot_umis_per_gene_compare_density <- function (sce_x, sce_y, group=NULL,
include_zeros=FALSE,
label_x="CellRanger",
label_y="scUTRquant") {
group <- enquo(group)
idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
y=rowSums(counts(sce_y[idx_genes,idx_cells])),
gene_id=idx_genes) %>%
filter(include_zeros | (x > 0 | y > 0)) %>%
mutate(ratio=(1+y)/(1+x)) %>%
left_join(as_tibble(rowData(sce_y[idx_genes,]), rownames="gene_id2"), by=c("gene_id"="gene_id2"))
ggplot(df, aes(x=ratio)) +
geom_vline(xintercept=1, linetype='dashed') +
geom_hline(yintercept=0, color='grey') +
geom_density() +
scale_x_log10(limits=c(1/100, 100)) +
facet_wrap(vars(!!group), ncol=1) +
labs(x=sprintf("%s/%s UMI Counts Per Gene", label_y, label_x), y="Density") +
theme_minimal_vgrid()
}
plot_umis_per_gene_compare_violin <- function (sce_x, sce_y, group=NULL,
include_zeros=FALSE,
min_counts=0,
label_x="CellRanger",
label_y="scUTRquant") {
group <- enquo(group)
idx_genes <- intersect(rownames(sce_x), rownames(sce_y))
idx_cells <- intersect(colnames(sce_x), colnames(sce_y))
df_rd <- rowRanges(sce_y[idx_genes,]) %>%
as.data.frame(row.names=names(.)) %>%
as_tibble(rownames='gene_id2')
df <- tibble(x=rowSums(counts(sce_x[idx_genes,idx_cells])),
y=rowSums(counts(sce_y[idx_genes,idx_cells])),
gene_id=idx_genes) %>%
filter(include_zeros | (x > min_counts | y > min_counts)) %>%
mutate(ratio=(1+y)/(1+x)) %>%
left_join(df_rd, by=c("gene_id"="gene_id2"))
ggplot(df, aes(x=!!group, y=ratio)) +
geom_hline(yintercept=1, linetype='dashed') +
geom_violin(draw_quantiles=c(0.25,0.5,0.75)) +
scale_y_log10() +
coord_cartesian(ylim=c(1/10, 10)) +
labs(y=sprintf("%s/%s UMI Counts Per Gene", label_y, label_x)) +
theme_bw()
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_dot,
group=atlas.utr_type)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=atlas.utr_type, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=n_txs, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=n_exons, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=seqnames, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=strand, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=has_ipa, include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=cut_number(atlas.ncelltypes_gene, n=6),
include_zeros=FALSE)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
df_counts %>%
transmute(sample_id, g=map2(sce_10x, sce_sq, plot_umis_per_gene_compare_violin,
group=cut_number(atlas.ncelltypes_gene, n=8),
include_zeros=FALSE, min_counts=50)) %>%
deframe() %>% {
for (id in names(.)) {
## print locally in this document
print(.[[id]] + ggtitle(id))
}
}
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS/LAPACK: /Users/mfansler/miniconda3/envs/bioc_3_12/lib/libopenblasp-r0.3.12.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BSgenome.Mmusculus.UCSC.mm10_1.4.0 BSgenome_1.58.0
## [3] Biostrings_2.58.0 XVector_0.30.0
## [5] Matrix_1.3-2 cowplot_1.1.0
## [7] magrittr_2.0.1 ggbeeswarm_0.6.0
## [9] forcats_0.5.1 stringr_1.4.0
## [11] dplyr_1.0.5 purrr_0.3.4
## [13] readr_1.4.0 tidyr_1.1.3
## [15] tibble_3.1.0 ggplot2_3.3.3
## [17] tidyverse_1.3.0 rtracklayer_1.50.0
## [19] plyranges_1.10.0 DropletUtils_1.10.0
## [21] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [23] Biobase_2.50.0 GenomicRanges_1.42.0
## [25] GenomeInfoDb_1.26.0 IRanges_2.24.0
## [27] S4Vectors_0.28.0 BiocGenerics_0.36.0
## [29] MatrixGenerics_1.2.0 matrixStats_0.58.0
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.0 bitops_1.0-6
## [3] lubridate_1.7.10 httr_1.4.2
## [5] tools_4.0.2 backports_1.2.1
## [7] utf8_1.1.4 R6_2.5.0
## [9] vipor_0.4.5 HDF5Array_1.18.0
## [11] DBI_1.1.1 colorspace_2.0-0
## [13] rhdf5filters_1.2.0 withr_2.4.1
## [15] tidyselect_1.1.0 compiler_4.0.2
## [17] cli_2.3.1 rvest_1.0.0
## [19] xml2_1.3.2 DelayedArray_0.16.0
## [21] scales_1.1.1 digest_0.6.27
## [23] Rsamtools_2.6.0 rmarkdown_2.7
## [25] R.utils_2.10.1 pkgconfig_2.0.3
## [27] htmltools_0.5.1.1 sparseMatrixStats_1.2.0
## [29] highr_0.8 dbplyr_2.1.0
## [31] limma_3.46.0 rlang_0.4.10
## [33] readxl_1.3.1 rstudioapi_0.13
## [35] DelayedMatrixStats_1.12.0 farver_2.1.0
## [37] generics_0.1.0 jsonlite_1.7.2
## [39] BiocParallel_1.24.0 R.oo_1.24.0
## [41] RCurl_1.98-1.2 GenomeInfoDbData_1.2.4
## [43] scuttle_1.0.0 Rcpp_1.0.6
## [45] munsell_0.5.0 Rhdf5lib_1.12.0
## [47] fansi_0.4.2 lifecycle_1.0.0
## [49] R.methodsS3_1.8.1 stringi_1.5.3
## [51] yaml_2.2.1 edgeR_3.32.0
## [53] zlibbioc_1.36.0 rhdf5_2.34.0
## [55] grid_4.0.2 dqrng_0.2.1
## [57] crayon_1.4.1 lattice_0.20-41
## [59] haven_2.3.1 beachmat_2.6.4
## [61] hms_1.0.0 locfit_1.5-9.4
## [63] knitr_1.31 pillar_1.5.1
## [65] reprex_1.0.0 XML_3.99-0.5
## [67] glue_1.4.2 evaluate_0.14
## [69] modelr_0.1.8 vctrs_0.3.6
## [71] cellranger_1.1.0 gtable_0.3.0
## [73] assertthat_0.2.1 xfun_0.20
## [75] broom_0.7.5 beeswarm_0.3.1
## [77] GenomicAlignments_1.26.0 ellipsis_0.3.1